; Run settings stored as flat "key = value" pairs with no [section] header.
; NOTE(review): the consuming parser is not visible from this file; confirm it
; skips ';' comment lines and accepts keys outside a section before merging.

; Presumably the sequence length in tokens per sample; TODO confirm unit.
seqlength = 4096
; Presumably the per-step (micro) batch size; verify whether it is per device.
batchsize = 1
; Presumably the number of gradient-accumulation steps per optimizer update;
; confirm against the consumer.
accumulate_steps = 44
; 100,000,000. Presumably the total token budget for the run; TODO confirm.
train_tokens = 100000000
; 9.89e14, i.e. 989 * 10^12. Presumably the theoretical peak FLOP/s of one
; accelerator, used as the denominator for a utilization figure; confirm
; which device and numeric precision this value refers to.
theoryflops = 989000000000000.0
epochs = 1
; Booleans are spelled with a capital first letter (True/False). Keep that
; spelling unless the parser is known to accept other forms.
; Presumably toggles a FlashAttention kernel; TODO confirm.
flashattn = True
; Presumably toggles activation recomputation (checkpointing); TODO confirm.
recompute = False
; Presumably the tensor- and pipeline-parallel degrees. If so, one model
; replica spans 8 * 4 = 32 ranks; verify against the launcher.
tensor_parallel = 8
pipeline_parallel = 4